import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
# Global plotting and pandas display configuration: ggplot look, and show
# every row/column when a DataFrame is displayed.
plt.style.use('ggplot')
for _opt in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_opt, None)
# --- Load the store table from the SQLite database -----------------------
conn = sqlite3.connect('liquor.db')
cursor = conn.cursor()
# Exploratory query kept for reference:
#query = '''SELECT "Vendor name" FROM vendors where "Vendor number"=10;'''
query = '''SELECT * FROM stores ;
'''
try:
    cursor.execute(query)
    result = cursor.fetchall()
finally:
    # FIX: always release DB resources, even if the query raises
    # (the original closed them only on the happy path).
    cursor.close()
    conn.close()
df = pd.DataFrame(result, columns=['StoreID', 'StoreName', 'County', 'CountyID',
                                   'City', 'ZipCode', 'Location', 'Address', 'StoreType'])
df.shape   # displayed when run as a notebook cell
df.head()  # quick visual sanity check
# The raw data marks unknown values with the literal string 'missing';
# normalise them to NaN so pandas treats them as missing data.
df = df.replace('missing', np.nan)
# Store count per county, top 20.
df.groupby('County').agg({'StoreID': 'count'}).sort_values(by='StoreID', ascending=False).head(20).plot.bar()
# Hy-Vee stores only.
# FIX: case=False makes the match robust to capitalisation ('Hy-Vee' vs
# 'hy-vee'); na=False prevents NaN store names (created by the
# 'missing' -> NaN replacement above) from breaking the boolean mask.
df_hy = df[df.StoreName.str.contains('hy-vee', case=False, na=False)]
df_hy.groupby('County').agg({'StoreID': 'count'}).sort_values(by='StoreID', ascending=False).plot.bar()
It seems that Polk County has the most stores.
# Load county population data and normalise its county names to lower case
# without the ' County' suffix (e.g. 'Polk County' -> 'polk').
county = pd.read_csv('data/County.csv')
county.County = county.County.str.replace(' County', '').str.lower()
# Which store-table county names have no match in the population table?
set(df.County.unique()).difference(set(county.County.unique()))
# Rebuild df.County from CountyID so the spellings agree with `county`.
temp = pd.merge(df[['County', 'CountyID']], county, on='County', how='right').drop_duplicates()
dic = dict(zip(temp.CountyID, temp.County))
dic[np.nan] = np.nan  # stores with an unknown CountyID stay unknown
df['County'] = [dic[cid] for cid in df.CountyID]
df.County.nunique()
# Attach population data to each store row (the Rank column is not needed).
df1 = pd.merge(df, county.drop('Rank', axis=1), on='County', how='left')
df1.shape
df1.head()
df1.groupby('County').agg({'StoreID': 'count'}).sort_values(by='StoreID', ascending=False).head(20).plot.bar()
# Stores per capita: top-20 county bar plot.
group = df1.groupby('County').agg({'StoreID': 'count', 'Population': 'first'})
# Population is stored as a string like '1,234' -- strip commas and cast.
group.Population = group.Population.map(lambda p: int(p.replace(',', '')))
group['Store_per_capital'] = group.StoreID / group.Population
group.sort_values(by='Store_per_capital', ascending=False).head(20).plot.bar(y='Store_per_capital')
#plt.figure(figsize=(12,12))
sns.regplot(data=group, x='StoreID', y='Population')
The county population is positively correlated with the number of stores in the county.
Next, let's analyze the Hy-Vee supermarket stores in each county.
conn = sqlite3.connect("liquor.db")
query = '''
select "Store Name",Date,"Volume Sold","Sale","County","County Number" from transactions
left join stores on transactions."Store Number" == stores."Store Number"
where stores."Store Name" like 'hy-vee%'
'''
tran = pd.read_sql_query(query, conn)
tran.head()
dic['missing'] = np.nan
tran['County'] = [dic[x] for x in tran["County Number"]]
tran.Date = pd.to_datetime(tran.Date)
store_hy = tran.groupby('Store Name').agg({'Date':lambda x:(x.max()-x.min()).days/365,'County':'first'})
store_hy.groupby('County').agg({'Date':'mean'}).sort_values(by='Date',ascending=False).plot.barh()
Some counties have very stable, long-lived stores, while others do not.
# Number of distinct Hy-Vee stores that ever operated in each county.
tran.groupby(['County']).agg({'Store Name':'nunique'}).sort_values(by='Store Name',ascending = False).plot.bar()
Polk County seems to have had the most stores over the eight-year period.
# Monthly totals across all Hy-Vee stores: active store count, volume, sales.
monthly_aggs = {'Store Name': 'nunique',
                'Volume Sold': 'sum',
                'Sale': 'sum'}
group3 = tran.groupby([pd.Grouper(key='Date', freq='M')]).agg(monthly_aggs).reset_index()
sns.lineplot(data=group3, x='Date', y='Store Name')
There appears to be a large increase from 2018 to 2020.
# The same monthly aggregation, broken out by county.
group2 = (tran
          .groupby(['County', pd.Grouper(key='Date', freq='M')])
          .agg({'Store Name': 'nunique', 'Volume Sold': 'sum', 'Sale': 'sum'})
          .reset_index())
plt.figure(figsize=(12, 6))
sns.lineplot(data=group2, x='Date', y='Store Name', hue='County', legend='brief')
# Move the (large) legend outside the plot axes.
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
Polk County seems to contribute most to the big increase, so let's look at Polk individually.
# Focus on Polk County: store count, volume sold, and sales over time.
polk = group2[group2['County'] == 'polk']
for metric in ('Store Name', 'Volume Sold', 'Sale'):
    sns.lineplot(data=polk, x='Date', y=metric)
Some stores are missing location data but do have address and county information, so we download their coordinates using geopy.
# Stores that lack a Location but have a usable County: build the lookup
# strings ("address,county") that will be fed to the geocoder.
missing_loc = df[df.Location.isnull() & ~df.County.isnull()]
addresses = missing_loc.Address + ',' + missing_loc.County
# Use geopy's Nominatim geocoder to look up coordinates for the missing
# addresses.  NOTE: network access required.
import geopy
from geopy import Nominatim
from geopy.extra.rate_limiter import RateLimiter
locator = Nominatim(user_agent="myGeocoder")
# Sanity check with a single known address.
location = locator.geocode("504 south highway,pottawattamie")
location
# Nominatim's usage policy requires throttling requests (>= 1 s apart).
geocode = RateLimiter(locator.geocode, min_delay_seconds=1.5)
# FIX: isinstance() is the idiomatic type check (was `type(addr)==str`).
# Non-string entries (NaN addresses) yield None.
locations = [geocode(addr + ', Iowa, USA') if isinstance(addr, str) else None
             for addr in addresses]
# Extract the (lat, lon) point from each geocoder result; keep the rows
# that actually resolved.
# FIX: compare to None with `is not None` (identity), not `!= None`.
missing_loc.Location = [loc[1] if loc is not None else loc for loc in locations]
missing_loc = missing_loc[~missing_loc.Location.isnull()]
# geopy points are ordered (latitude, longitude[, altitude]).
lat, long = zip(*[(float(p[0]), float(p[1])) for p in missing_loc.Location])
missing_loc['long'] = long
missing_loc['lat'] = lat
missing_loc = missing_loc.drop(['Location'], axis=1)
# Parse the WKT 'POINT (lon lat)' strings for stores that already have a
# location, then combine with the geocoded rows.
loca = df.loc[~df.Location.isnull(), :].copy()
loca['Location'] = loca.Location.map(lambda s: s.replace('POINT (', '').replace(')', '').strip().split(' '))
# WKT points are ordered (longitude, latitude).
long, lat = zip(*[(float(p[0]), float(p[1])) for p in loca.Location])
loca['long'] = long
loca['lat'] = lat
loca = loca.drop(['Location'], axis=1)
loca = pd.concat([loca, missing_loc], axis=0)
# Drop obviously wrong geocodes (coordinates outside Iowa's bounding box).
loca = loca[(loca.long > -104) & (loca.lat < 44)]
# Equirectangular correction so one unit of longitude spans roughly the same
# ground distance as one unit of latitude at Iowa's latitude (~42.05 deg).
# BUG FIX: np.cos expects radians; the original passed degrees directly,
# which scaled longitudes by cos(42.05 rad) ~ -0.4 instead of ~0.74.
loca['new_long'] = loca['long'] * np.cos(np.radians(42.05))
#loca.to_csv('loca.csv',index_label=False)
# Reload the pre-computed location table and sanity-plot it.
loca = pd.read_csv('data/loca.csv')
loca.plot.scatter(x='long', y='lat')
#plt.xlim([-98,-85])
# Feature matrix for clustering, plus a label table for later joins.
samples = loca[['new_long', 'lat']].reset_index(drop=True)
temp = loca[['StoreName', 'long', 'lat', 'new_long', 'County']]
from sklearn.cluster import DBSCAN
from sklearn import metrics
# Grid-search log: one row per (eps, min_samples) combination tried.
rs = []
# Candidate DBSCAN hyper-parameters to sweep.
eps = np.arange(0.01,0.2,0.01)
min_samples = np.arange(20,400,1)
# Best silhouette score seen so far and the parameters that produced it.
best_score = 0
best_score_eps = 0
best_score_min_samples = 0
for i in eps:
    for j in min_samples:
        try:
            db = DBSCAN(eps=i, min_samples=j).fit(samples)
            labels = db.labels_
            # Filter noise points (DBSCAN labels them -1).
            # BUG FIX: the original filtered with `y_pred > 0`, which also
            # discarded cluster 0; `>= 0` keeps every real cluster and drops
            # only the noise label.
            dbscan = temp.copy()
            dbscan['y_pred'] = labels
            dbscan = dbscan[dbscan.y_pred >= 0]
            new_samples = dbscan[['new_long', 'lat']]
            new_labels = dbscan['y_pred']
            # Score the non-noise points and record bookkeeping metrics.
            k = metrics.silhouette_score(new_samples, new_labels)
            ratios = len(labels[labels[:] == -1]) / len(labels)
            n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
            rs.append([i, j, k, ratios, n_clusters])
            if k > best_score:
                best_score = k
                best_score_eps = i
                best_score_min_samples = j
        except ValueError:
            # silhouette_score raises ValueError when fewer than 2 clusters
            # survive; skip this combination.  (FIX: the original used a bare
            # `except:`, which silently hides unrelated errors too.)
            db = ''
# Collect the grid-search log into a DataFrame and inspect promising combos:
# high silhouette score with an acceptable noise ratio.
rs = pd.DataFrame(rs, columns=['eps', 'min_samples', 'score', 'ratio', 'n_clusters'])
candidates = rs[(rs.score > 0.8) & (rs.ratio < 0.6)]
candidates.sort_values(by='score', ascending=False).head(5)
# rs[(rs.score>0) & (rs.n_clusters<100)].sort_values(by='score',ascending=False)
Best parameters: eps = 0.06, min_samples = 46.
def plot_dbscan(eps, min_samples, include_noise=True):
    """Fit DBSCAN on `samples` and scatter-plot the clusters on a lon/lat map.

    Parameters
    ----------
    eps, min_samples : DBSCAN hyper-parameters.
    include_noise : when False, points labelled -1 (noise) are dropped.
    """
    y_pred = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(samples)
    dbscan = loca.copy()
    dbscan['y_pred'] = y_pred
    if not include_noise:
        dbscan = dbscan[dbscan.y_pred >= 0]
    plt.figure(figsize=(12, 6))
    # FIX: pass the data vectors as keywords -- seaborn >= 0.12 no longer
    # accepts positional x/y arguments.
    sns.scatterplot(x=dbscan['long'], y=dbscan['lat'], hue=dbscan.y_pred,
                    legend='full', palette="Set1")
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
# Plot with the best grid-search parameters, hiding noise points.
plot_dbscan(0.06,46,False)
Now we classify stores into urban and rural areas: stores labelled as noise by DBSCAN are set to rural and the rest to urban. We then compare the longevity of stores in urban versus rural areas.
conn = sqlite3.connect("liquor.db")
query = '''
SELECT * FROM transactions
'''
tran = pd.read_sql_query(query, conn)
tran.head()
# Per-store lifetime and sales statistics.
tran.Date = pd.to_datetime(tran.Date)

def _span_years(d):
    # Approximate years between the first and last recorded transaction.
    return (d.max() - d.min()).days / 365

def _span_months(d):
    # Whole months between the first and last recorded transaction.
    return (d.max().year - d.min().year) * 12 + (d.max().month - d.min().month)

long_g = tran.groupby(['Store Number']).agg(
    {'Date': [_span_years, _span_months,
              lambda d: d.max().year,
              lambda d: d.min().year],
     'Volume Sold': 'sum'})
# NOTE: 'days' actually holds the lifetime in *years*, and 'volumn' is a
# historic typo -- both names are kept because later cells reference them.
long_g.columns = ['days', 'month', 'max year', 'min year', 'volumn sold']
long_g['month'] = long_g['month'].replace(0, 1)  # avoid division by zero
long_g = long_g.reset_index()
long_g['volumn_sold_bymonth'] = long_g['volumn sold'] / long_g['month']
#long_g.to_csv('long.csv',index_label=False)
# Reload the pre-computed per-store stats.
long_g = pd.read_csv('data/long.csv')
# Drop stores that opened less than ~3 years ago (still active in 2020 but
# first observed in 2018-2020).
# BUG FIX: 'min year' is an integer column, so the original
# .isin(['2020','2019','2018']) compared ints against strings, matched
# nothing, and silently made the filter a no-op.  Compare against integers.
long_g = long_g[~((long_g['max year'] == 2020) & (long_g['min year'].isin([2020, 2019, 2018])))]
# classify
y_pred = DBSCAN(eps = 0.06,min_samples=46).fit_predict(samples)
dbscan = loca.copy()
dbscan['y_pred'] = y_pred
dbscan['area density'] = ['urban' if x!=-1 else 'rural' for x in dbscan.y_pred]
#dbscan.StoreID = dbscan.StoreID.map(str)
#dbscan.to_csv('location_cluster.csv',index_label=False)
# Join the per-store stats with the urban/rural label and store type.
final = pd.merge(long_g, dbscan[['StoreID', 'area density', 'StoreType']],
                 left_on='Store Number', right_on='StoreID')
# Collapse rare store types into 'Other'.
rare_types = ['Casino', 'Other Grocery or Convenience']
final.loc[final.StoreType.isin(rare_types), 'StoreType'] = 'Other'
final['area density'].value_counts()
final['StoreType'].value_counts()
# Store-type counts within each area class.
group_store = final.groupby('area density').agg({'StoreType': 'value_counts'})  # .reset_index()
group_store.columns = ['count']
group_store = group_store.reset_index()
group_store
# Pie charts of the store-type mix in rural vs urban areas.
# FIX: the original hard-coded a single label list for both pies, silently
# assuming value_counts returns types in the same order for rural and urban;
# take the labels from each group_store slice instead so wedges are always
# labelled correctly.  (Rows 0:5 are rural, 5:10 urban, because the groupby
# sorts 'rural' before 'urban'.)
rural = group_store.iloc[0:5]
urban = group_store.iloc[5:10]
fig1, ax1 = plt.subplots(2, 1, figsize=(18, 12))
ax1[0].pie(rural['count'], labels=rural['StoreType'], autopct='%1.1f%%',
           shadow=True, startangle=90)
ax1[0].axis('equal')  # equal aspect ratio so the pie is drawn as a circle
ax1[0].set_title('Rural area')
ax1[1].pie(urban['count'], labels=urban['StoreType'], autopct='%1.1f%%',
           shadow=True, startangle=90)
ax1[1].axis('equal')
ax1[1].set_title('Urban area')
plt.show()
# Mean longevity and monthly volume by area class, then by store type.
final.groupby('area density').agg({'days': 'mean', 'volumn_sold_bymonth': 'mean'})
group = (final.groupby(['StoreType', 'area density'])
         .agg({'days': 'mean', 'volumn_sold_bymonth': 'mean', 'volumn sold': 'mean'})
         .reset_index())
plt.figure(figsize=(12, 6))
#sns.barplot(data=group,x='StoreType',y='days',hue='area density')
sns.barplot(data=final, x='StoreType', y='days', hue='area density')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.figure(figsize=(12, 6))
sns.barplot(data=final, x='StoreType', y='volumn_sold_bymonth', hue='area density',
            order=['Supermarket', 'Liquor Tobacco Store', 'Other', 'Drug Store', 'Convenience Store'])
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
Conclusion: in urban areas, opening a supermarket looks best; in rural areas, a supermarket or a liquor store both look promising.
# Convert county populations from '1,234'-style strings to integers.
county.Population = county.Population.map(lambda p: int(p.replace(',', '')))
def tran_name(x):
    """Title-case a county name so it matches the geojson spelling.

    Two space-separated words -> capitalise both ('des moines' -> 'Des Moines');
    two apostrophe-separated parts -> capitalise both ("o'brien" -> "O'Brien");
    otherwise capitalise the whole string.
    """
    words = x.split()
    if len(words) == 2:
        return ' '.join(w.capitalize() for w in words)
    apos_parts = x.split("'")
    if len(apos_parts) == 2:
        return "'".join(p.capitalize() for p in apos_parts)
    return x.capitalize()
# Fix county-name casing so it matches the geojson feature names, then
# geocode 'Iowa' to get a map centre (network call to Nominatim).
county.County = county.County.map(tran_name)
locator = Nominatim(user_agent="myGeocoder")
location = locator.geocode("Iowa")
location
import folium
# Build one FeatureGroup of circle markers per area class so urban (red)
# and rural (blue) stores are drawn as separate overlays on the map.
incidents_group = []
color = ['red', 'blue']
group = ['urban', 'rural']
for i in range(2):
    incidents = folium.map.FeatureGroup()
    subset = dbscan[dbscan['area density'] == group[i]]
    # Add a small circle marker for every store in this area class.
    for lng, lat in zip(subset.long, subset.lat):
        incidents.add_child(
            folium.CircleMarker(
                [lat, lng],
                radius=0.1,       # marker size
                color=color[i],
                fill=True,
                fill_color=color[i],
                fill_opacity=0.6,
            )
        )
    incidents_group.append(incidents)
# GeoJSON with Iowa county boundaries.  Original download source (kept for
# reference; the analysis uses the local copy):
# https://public.opendatasoft.com/explore/dataset/us-county-boundaries/download/?format=geojson&refine.stusab=IA&timezone=America/New_York&lang=en
# FIX: the original assigned the remote URL and immediately overwrote it
# (dead assignment), then wrapped the path in a pointless f-string.
url = 'lowa.geojson'  # NOTE: the local file really is named 'lowa' (sic)
san_geo = url
# Colour-scale thresholds: 10 evenly spaced population cut points.
#bins = list(county['Population'].quantile(np.linspace(0,1,10)))
threshold_scale = np.linspace(county['Population'].min(),
                              county['Population'].max(),
                              10, dtype=int)
threshold_scale = threshold_scale.tolist()  # folium wants a plain list
# folium requires the last threshold to exceed the maximum data value.
threshold_scale[-1] = threshold_scale[-1] + 1
# Choropleth of county population with the urban/rural store markers on top.
m = folium.Map(location=[41.9216734, -93.3122705], zoom_start=8)
folium.Choropleth(
    geo_data=san_geo,
    data=county,
    columns=['County', 'Population'],
    key_on='feature.properties.name',  # county name field in the geojson
    fill_color='YlGn',
    threshold_scale=threshold_scale,
    fill_opacity=0.7,
    line_opacity=0.2,
    highlight=True,
    legend_name='Population',
).add_to(m)
# Overlay both marker groups (urban, rural).
for feature_group in incidents_group:
    m.add_child(feature_group)
m